****************************************************************************************
* This file estimates trade costs following Hummels (2007)

* (1) Combine US Imports of Merchandise Data 1999 - 2006 from Feenstra (https://cid.econ.ucdavis.edu/usix.html)
* (2) Add data on distance and oil prices
* (3) Estimate Trade Cost Specification following Hummels (2007)
****************************************************************************************

clear all 
set more off

cd ${data_dir}

use imp99_con.dta, clear

append using imp00_con.dta, force
append using imp01_con.dta, force
append using imp02_con.dta, force
append using imp03_con.dta, force
append using imp04_con.dta, force
append using imp05_con.dta, force
append using imp06_con.dta, force

* consistent time index
replace year = year + 1900

* country codes
gen countrycode = substr(ccode,3,3)
destring countrycode, replace
sort countrycode

merge n:1 countrycode using countrycorrespondence.dta

* fix missing codes manually:
tab countrycode if _merge == 1
replace countryalpha = "DEU" if countrycode == 280
replace countryalpha = "ANT" if countrycode == 532
replace countryalpha = "PAN" if countrycode == 590
replace countryalpha = "YEM" if countrycode == 886
replace countryalpha = "ASM" if countrycode == 888
replace countryalpha = "TWN" if countrycode == 896

drop if _merge == 2
drop _merge

* Add data on distance
merge n:1 countryalpha using distance_us.dta

tab countrycode if _merge == 1
drop if _merge == 2
drop _merge 

* Add data on oil prices
rename year aar
tostring aar, replace
merge n:1 aar using oilprices.dta, keepusing(brent)
tab aar _merge
keep if _merge == 3
drop _merge

rename aar year
destring year, replace

* If country information is unavailable, distance is unknown and observation cannot be used.
drop if countrycode == 0

* First apply the same correspondence to HS 1988/92 as for the Danish data and UN Comtrade:
**********************************************************************************************
gen h1_code = hs6 if year == 1999 | year == 2000 | year == 2001
gen h2_code = hs6 if year == 2002 | year == 2003 | year == 2004 | year == 2005 | year == 2006 

sort h1_code
merge n:1 h1_code using "HS2_1996_1986.dta"
tab year _merge
drop if _merge == 2
drop _merge

sort h2_code
merge n:1 h2_code using "HS2_2002_1986.dta"
tab year _merge
drop if _merge == 2
drop _merge

gen h0_hs2 = h0_hs2_1996 if year == 1999 | year == 2000 | year == 2001
replace h0_hs2 = h0_hs2_2002 if year == 2002 | year == 2003 | year == 2004 | year == 2005 | year == 2006

* Next, Use GDP deflator for the US to get real transport charges
*******************************************************************
gen deflator = 86.8 if year == 1999
replace deflator = 88.7 if year == 2000
replace deflator = 90.7 if year == 2001
replace deflator = 92.2 if year == 2002
replace deflator = 94.1 if year == 2003
replace deflator = 96.8 if year == 2004
replace deflator = 100 if year == 2005
replace deflator = 103.2 if year == 2006

replace charge = 100*charge / deflator
replace gvalue = 100*gvalue / deflator
replace brent = 100*brent / deflator

* Summarize HS2 product groups
egen hs2group = group(h0_hs2)
egen hs2time = group(h0_hs2 year)

* Preparation for EU Membership coefficient
*********************************************

* Define relevant countries:
gen neweugroup = (countrycode == 100 | countrycode == 642 | countrycode == 196 | countrycode == 203 | countrycode == 233 | countrycode == 348 | countrycode == 428 | countrycode == 470 | countrycode == 616 | countrycode == 703 | countrycode == 705)
gen neweu = (countrycode == 100 | countrycode == 642) & year > 2006
replace neweu = (countrycode == 196 | countrycode == 203 | countrycode == 233 | countrycode == 348 | countrycode == 428 | countrycode == 470 | countrycode == 616 | countrycode == 703 | countrycode == 705) & year > 2003

*************************************************************
* Estimate model both with and without weight information:
*************************************************************

* (1) Using Weight-per-value, sample of metric products
********************************************************

* Transform units into KG if possible
replace gquan = gquan*1000 if units == "TON" | units == "CTN"
replace gquan = gquan/1000 if units == "GM"
* indicator for kg/price
gen kgindex = (units == "CTN" | units == "CKG" | units == "CYK" | ///
	units == "KG" | units == "KGS" | units == "GKG" | units == "GM")
tab year kgindex

* Compute key variables for regression
gen wpv = gquan / gvalue
gen advalorem = charge / gvalue
* ad valorem freight cost in logs
gen tradecost = log(advalorem)
* weight per value in logs
gen logwpv = log(wpv)
* log oil price
gen logoil = log(brent)
* log distance 
gen logdist = log(dist)

* How many HS2 groups have weight information?
bys hs2group: egen hs2_kgcheck = max(kgindex)
bys hs2group: gen hs2_eval = _n==1
tab hs2_kgcheck if hs2_eval == 1


* Suppose we estimate coefficients by HS2, how many HS6 product observations can we use per HS2 group?
tab kgindex if hs2_kgcheck == 1
bys hs2group: egen hs2_hs6kg = total(kgindex)
sum hs2_hs6kg if hs2_kgcheck == 1 & hs2_eval == 1, det

* List HS2 groups that we will be missing with weight information:
list h0_hs2 if hs2_eval == 1 & hs2_kgcheck == 0

preserve
keep if kgindex == 1
keep tradecost logwpv logdist logoil year neweu* h0_hs2 hs2group hs2time 
drop if h0_hs2 == ""

set emptycells drop
set matsize 11000

* In order to apply the transport cost model to a different country (Denmark),
* I use interactions of distance, weight-per-value and product, ignoring country info

***********************************************************************
* MODEL B: wpv model with interactions, HS2-specific intercepts
*
* Results correspond to column (C) in Table A.3
***********************************************************************

areg tradecost c.logwpv##c.logdist##c.logoil, absorb(hs2group)
predict hs2groupcoef_B, d
replace hs2groupcoef_B = hs2groupcoef_B + _b[_cons]
gen wpvcoef_B = _b[c.logwpv]
gen distcoef_B = _b[c.logdist]
gen oilcoef_B = _b[c.logoil]
gen distoilcoef_B = _b[c.logdist#c.logoil]
gen wpvoilcoef_B = _b[c.logwpv#c.logoil]
gen wpvdistcoef_B = _b[c.logwpv#c.logdist]
gen wpvdistoilcoef_B = _b[c.logwpv#c.logdist#c.logoil]

outreg2 using "${results_dir}\TableA3.tex", keep(logwpv logdist logoil c.logwpv#c.logdist c.logwpv#c.logoil c.logdist#c.logoil c.logwpv#c.logdist#c.logoil) tex append

* save coefficients of all models by HS2 group:
************************************************
drop if tradecost == .
keep h0_hs2 hs2groupcoef* wpvcoef* distcoef* oilcoef* wpvoilcoef* distoilcoef* wpvdistcoef* wpvdistoilcoef*
bys h0_hs2: keep if _n==1
sort h0_hs2
save tradecost_results_1, replace

restore


* (2) Ignore Weight-per-value, sample of all products
********************************************************

preserve
keep tradecost logdist logoil year neweu* h0_hs2 hs2group hs2time 
drop if h0_hs2 == ""

set emptycells drop
set matsize 11000

******************************************************************************
* MODEL F: dist/oil model with HS2-specific intercepts and slope coefficients
*
* Results correspond to column (A) in Table A.3
******************************************************************************

areg tradecost logdist c.logdist#i.hs2group logoil c.logoil#i.hs2group c.logdist#c.logoil c.logdist#c.logoil#i.hs2group, absorb(hs2group)
predict hs2groupcoef_F, d
replace hs2groupcoef_F = hs2groupcoef_F + _b[_cons]
gen distcoef_F = 0
gen oilcoef_F = 0
gen distoilcoef_F = 0 
forvalues x=1/100{
cap replace wpvcoef_F = _b[logwpv] + _b[`x'.hs2group#c.logwpv] if hs2group == `x'
cap replace distcoef_F = _b[logdist] + _b[`x'.hs2group#c.logdist] if hs2group == `x'
cap replace oilcoef_F = _b[logoil] + _b[`x'.hs2group#c.logoil] if hs2group == `x'
cap replace distoilcoef_F = _b[c.logdist#c.logoil] + _b[`x'.hs2group#c.logdist#c.logoil] if hs2group == `x'
}
* save aggregate coefficients in table:
outreg2 using "${results_dir}\TableA3.tex", keep(logdist logoil c.logdist#c.logoil) tex append

* save coefficients:
************************************************
drop if tradecost == .
keep h0_hs2 hs2groupcoef* distcoef* oilcoef* distoilcoef*
bys h0_hs2: keep if _n==1
sort h0_hs2
save tradecost_results_2, replace

restore

preserve
keep if kgindex == 1
keep tradecost logwpv logdist logoil year neweu* h0_hs2 hs2group hs2time 
drop if h0_hs2 == ""

set emptycells drop
set matsize 11000

***********************************************************************
* MODEL H: wpv model with interactions, HS-2 intercepts, EU Membership
*
* Results correspond to column (D) in Table A.3
***********************************************************************

areg tradecost i.neweugroup i.neweugroup#i.neweu c.logwpv##c.logdist##c.logoil, absorb(hs2group)
predict hs2groupcoef_H, d
replace hs2groupcoef_H = hs2groupcoef_H + _b[_cons]

gen wpvcoef_H = _b[c.logwpv]
gen distcoef_H = _b[c.logdist]
gen oilcoef_H = _b[c.logoil]
gen distoilcoef_H = _b[c.logdist#c.logoil]
gen wpvoilcoef_H = _b[c.logwpv#c.logoil]
gen wpvdistcoef_H = _b[c.logwpv#c.logdist]
gen wpvdistoilcoef_H = _b[c.logwpv#c.logdist#c.logoil]

outreg2 using "${results_dir}\TableA3.tex", keep(1.neweugroup 1.neweugroup#1.neweu logwpv logdist logoil c.logwpv#c.logdist c.logwpv#c.logoil c.logdist#c.logoil c.logwpv#c.logdist#c.logoil) tex append

* save coefficients:
************************************************
drop if tradecost == .
keep h0_hs2 hs2groupcoef* wpvcoef* distcoef* oilcoef* wpvoilcoef* distoilcoef* wpvdistcoef* wpvdistoilcoef*
bys h0_hs2: keep if _n==1
sort h0_hs2
save tradecost_results_3, replace

restore


keep tradecost logdist logoil year neweu* h0_hs2 hs2group hs2time 
drop if h0_hs2 == ""

set emptycells drop
set matsize 11000

***********************************************************************
* MODEL L: dist/oil model with HS2 intercepts and slopes, EU Membership
*
* Results correspond to column (B) in Table A.3
***********************************************************************

areg tradecost i.neweugroup i.neweugroup#i.neweu logdist c.logdist#i.hs2group logoil c.logoil#i.hs2group c.logdist#c.logoil c.logdist#c.logoil#i.hs2group, absorb(hs2group)
predict hs2groupcoef_L, d
replace hs2groupcoef_L = hs2groupcoef_L + _b[_cons]
gen distcoef_L = 0
gen oilcoef_L = 0
gen distoilcoef_L = 0 
forvalues x=1/100{
cap replace wpvcoef_L = _b[logwpv] + _b[`x'.hs2group#c.logwpv] if hs2group == `x'
cap replace distcoef_L = _b[logdist] + _b[`x'.hs2group#c.logdist] if hs2group == `x'
cap replace oilcoef_L = _b[logoil] + _b[`x'.hs2group#c.logoil] if hs2group == `x'
cap replace distoilcoef_L = _b[c.logdist#c.logoil] + _b[`x'.hs2group#c.logdist#c.logoil] if hs2group == `x'
}
* save aggregate coefficients in table:
outreg2 using "${results_dir}\TableA3.tex", keep(1.neweugroup 1.neweugroup#1.neweu logdist logoil c.logdist#c.logoil) tex append

* save coefficients of all models by HS2 group:
************************************************
drop if tradecost == .
keep h0_hs2 hs2groupcoef* distcoef* oilcoef* distoilcoef*
bys h0_hs2: keep if _n==1
sort h0_hs2
save tradecost_results_4, replace

merge 1:1 h0_hs2 using tradecost_results_1
drop _merge
merge 1:1 h0_hs2 using tradecost_results_2
drop _merge
merge 1:1 h0_hs2 using tradecost_results_3
drop _merge

drop if h0_hs2 == ""
sort h0_hs2
save transport_results_all, replace

log close
